In [155]:
    
import requests
import bs4
Soup = bs4.BeautifulSoup
import csv
import re
import pandas as pd
import numpy as np
import math
from itertools import combinations
    
Importing website HTML data to be parsed
In [156]:
    
#To parse local HTML files
with open(r'C:/Users/ignacio.chavarria/Desktop/Scraping/aaa.html', "r") as f:
    content = f.read()
#To parse from web
#response = requests.get("http://dataquestio.github.io/web-scraping-pages/2014_super_bowl.html")
#content = response.content
    
In [157]:
    
with open(r"C:\Users\ignacio.chavarria\Desktop\Scraping\YaEsta\drinks.html", encoding="utf8") as f:
    content_d = f.read()
    
with open(r"C:\Users\ignacio.chavarria\Desktop\Scraping\YaEsta\snacks.html", "r", encoding="utf8") as f:
    content_s = f.read()
with open(r"C:\Users\ignacio.chavarria\Desktop\Scraping\YaEsta\chocolates.html", "r", encoding="utf8") as f:
    content_c = f.read()
    
In [158]:
    
p_d = Soup(content_d, 'html.parser')
p_s = Soup(content_s, 'html.parser')
p_c = Soup(content_c, 'html.parser')
    
Parsing titles and prices:
In [159]:
    
parsers = [p_d, p_s, p_c]
    
In [160]:
    
def cat_name(n):
    if n == 0:
        return "drink"
    elif n == 1:
        return "snack"
    elif n == 2:
        return "chocolate"
names = []
prices = []
categories = []
ct = 0
for parser in parsers:
    #Get product names
    names_raw = parser.select(".productName")
    for i in names_raw:
        names.append(i.text)
    #Get product prices
    prices_raw = parser.select(".prices")
    for i in prices_raw:
        if len(i) == 3:
            prices.append(float((i.text)[3:]))
        elif len(i) == 5:
            prices.append(float((i.find_all("span")[1].text)[1:]))
    #Get product categories
    cats = [ cat_name(ct) for i in range((len(categories)), len(prices))]
    for i in cats:
        categories.append(i)
    ct += 1
    
In [161]:
    
#Create dataframe
df = pd.DataFrame(
    {'name': names[:-9],
     'category': categories,
     'price': prices
    })
    
In [162]:
    
def amount(i):
    if re.findall('[ ]([0-9\,\.]+)(?=\s*[mMgG]([ lLrR\.]|\Z)([ \.]|\Z))', i):
        test2 = re.findall('[ ]([0-9\,\.]+)(?=\s*[mMgG]([ lLrR\.]|\Z)([ \.]|\Z))', i)
    else:
        test2 = re.findall('[ ]([0-9\,\.]+)(?=\s*[mMgG]([ lLrR\.]|\Z))', i)
    if test2:
        return float(test2[0][0].replace(",", "."))
    else:
        return math.nan
df['amount'] = df['name'].apply(lambda x: amount(x))
    
In [163]:
    
df = df.dropna(subset=['amount']).reset_index(drop=True)
    
In [164]:
    
#df['category'].value_counts()
    
In [165]:
    
df.shape
    
    Out[165]:
In [166]:
    
df['amount_per_dollar'] =  df['amount'] / df['price']
    
In [167]:
    
df['idx'] = df.index
    
In [168]:
    
gc = 5
    
In [169]:
    
#df.shape
    
In [170]:
    
df = df[df['price'] <= gc]
df.shape
    
    Out[170]:
In [171]:
    
#df['category'].value_counts()
    
In [172]:
    
df_c = df.loc[df['category'] == 'chocolate', :].sort_values('amount_per_dollar', ascending=[False])
df_d = df.loc[df['category'] == 'drink', :].sort_values('amount_per_dollar', ascending=[False])
df_s = df.loc[df['category'] == 'snack', :].sort_values('amount_per_dollar', ascending=[False])
#df_d = df.loc[lambda df: df.category == 'drink', :].sort_values('amount_per_dollar', ascending=[False])
#df_s = df[df['category'] == 'snack'].sort_values('amount_per_dollar', ascending=[False])
    
In [173]:
    
df_s1 = df_s.iloc[:10].sort_values('price', ascending=True)
df_d1 = df_d.iloc[:10].sort_values('price', ascending=True)
df_c1 = df_c.iloc[:10].sort_values('price', ascending=True)
    
In [174]:
    
dfs = [df_s1, df_d1, df_c1]
#Create dictionary with K values for computing combinations per df
d = {}
for n in range(len(dfs)):
    products = 0
    total = 0
    for i in range(dfs[n].shape[0]):
        if dfs[n].price.iloc[i] <= gc:
            total += dfs[n].price.iloc[i]
            if total >= gc:
                break
            products += 1
        else:
            break
    d[n] = int(products)
    
In [175]:
    
#Find top combinations per segment
top = {}
for y in range(len(dfs)):    
    combination_id = {}
    ti, tp, ta = [],[],[]
    for k in range(1, d[y] + 1):
        ti += list(combinations(dfs[y].idx, k))
        tp += list(combinations(dfs[y].price, k))
        ta += list(combinations(dfs[y].amount, k))
    for i in range(len(ti)):
        if sum(tp[i]) <= gc:
            combination_id[i] = []
            combination_id[i].append(list(ti[i]))
            combination_id[i].append(sum(tp[i]))
            combination_id[i].append(sum(ta[i]))
    top[y] = sorted(combination_id.items(), key=lambda x: x[1][2], reverse=True)[0][1:]
    
In [176]:
    
top
    
    Out[176]:
In [177]:
    
labels = ["'snack'", "'drink'", "'candy'"]
def gr_or_ml(category):
    if category == "'drink'":
        return "Total milliliters:"
    else:
        return "Total grams:"
for i in top:
    print('Basket:', labels[i])
    for x in top[i][0][0]:
        print(" ", df.loc[lambda df: df.idx == x, 'name'].item())
    print('Total cost: $', top[i][0][1])
    print(gr_or_ml(labels[i]), top[i][0][2])
    print("\n")
    
    
In [226]:
    
from sklearn.utils import shuffle
original_dfs = [df_s, df_d, df_c]
results_dict = {}
random_baskets = 100000
for df_no in range(len(original_dfs)):
    random_results = []
    min_price = original_dfs[df_no].sort_values('price', ascending=True)['price'].iloc[0]
    for i in range(random_baskets):
        random_df = shuffle(original_dfs[df_no])#.sample(frac=1)
        random_basket = []
        balance = gc
        amount = 0
        for n in range(random_df.shape[0]):
            if balance - random_df.price.iloc[n] >= 0:
                balance -= random_df.price.iloc[n]
                amount += random_df.amount.iloc[n]
                random_basket.append(random_df.name.iloc[n])
            elif balance < min_price:
                break
            else:
                pass
        random_results.append(amount)
    results_dict[df_no] = random_results
    
In [230]:
    
print(max(results_dict[1]))
print(top[1][0][2])
print(len([x for x in results_dict[1] if x >= top[1][0][2]]))
    
    
In [231]:
    
for i in range(len(labels)):
    print("Random", labels[i], "baskets tie or beat the optimized basket", "%.2f%%" % (100 * 
          len([x for x in results_dict[i] if x >= top[i][0][2]]) / random_baskets), "of the time.")
    
    
In [232]:
    
optimized_amount_snack = top[0][0][2]
optimized_amount_drink = top[1][0][2]
optimized_amount_candy = top[2][0][2]
opt_baskets = [optimized_amount_snack, optimized_amount_drink, optimized_amount_candy]
for i in range(len(opt_baskets)):
    print("The optimized", labels[i], "basket is over", 
          math.floor((opt_baskets[i] - np.mean(results_dict[i])) / np.std(results_dict[i])), 
          "standard deviations away from the random basket mean")
    #print(np.mean(results_dict[i]) + (np.std(results_dict[i]) * 3))
    
    
In [221]:
    
print(opt_baskets[0], np.mean(results_dict[0]), np.std(results_dict[0]))
    
    
In [237]:
    
import seaborn as sns
import matplotlib.pyplot as plt
plt.rcParams.update({'font.size': 8})
fig, ax = plt.subplots(figsize=(6.2,5))
sns.set_style("white")
plt.subplot(2,2,1)
plt.title('Snacks')
plt.hist(results_dict[0], color='#F04824')
plt.axvline(np.mean(results_dict[0]), color='y', linewidth=2)
plt.axvline(optimized_amount_snack-4, color='y', linestyle='dashed', linewidth=2)
sns.despine(right=True)
plt.subplot(2,2,2)
plt.title('Drinks')
plt.hist(results_dict[1], color='#F04824')
plt.axvline(np.mean(results_dict[1]), color='y', linewidth=2)
plt.axvline(optimized_amount_drink, color='y', linestyle='dashed', linewidth=2)
sns.despine(right=True)
plt.subplot(2,2,3)
plt.title('Candy')
plt.hist(results_dict[2], color='#F04824')
plt.axvline(np.mean(results_dict[2]), color='y', linewidth=2)
plt.axvline(optimized_amount_candy-2, color='y', linestyle='dashed', linewidth=2)
sns.despine(right=True)
plt.tight_layout()
plt.show()
    
    
In [239]:
    
k = 6
drink_items = df.loc[df['category'] == 'drink', 'name']
print("There are", drink_items.shape[0], "total items in the 'drink' category with", 
      (len(list(combinations(drink_items, k)))), "million combinations at k =", k, ".")